/*
Package mash is for sketching sequence data to make it easier to compare to other sequence.

The package is named mash after the mash sketching algorithm, which is based on the MinHash algorithm.

Mash: fast genome and metagenome distance estimation using MinHash.
Ondov, B.D., Treangen, T.J., Melsted, P. et al.
Genome Biol 17, 132 (2016).
https://doi.org/10.1186/s13059-016-0997-x

Mash Screen: high-throughput sequence containment estimation for genome discovery.
Ondov, B., Starrett, G., Sappington, A. et al.
Genome Biol 20, 232 (2019).
https://doi.org/10.1186/s13059-019-1841-x

tl;dr for the papers above:

Comparing biological sequences is really hard because similar sequences can have frameshifts that make it impossible
to measure similarity using more common metric distances like hamming distance or levenshtein distance.

Bioinformatics and nlp researchers have come up with tons of string comparison algorithms that are better suited for
comparing biological sequences. For example poly already implements a few of them like the Needleman-Wunsch and Smith-Waterman
algorithms in our "align" package.

Mash is a different approach to comparing biological sequences. It uses a technique called sketching to reduce the
complexity of the sequence to a vector of hashes. The hashes are generated by sliding a window of size k along the
sequence and hashing each kmer. The hash is then stored in a vector of size s. The vector is sorted and the smallest
hash is kept. The process is repeated until the vector is full. The vector of hashes is the sketch.

The sketch is then compared to other sketches by counting the number of hashes that are the same between the two sketches.
The number of hashes that are the same is divided by the size of the sketch to get a distance between 0 and 1.

Hash vectors can only be compared to other hash vectors that use the same sliding window size.
Sketch size limits how many hashes can be stored in the vector and the return vector
will always be of length of the sketch size and filled the smallest hashes that were generated
and sorted from smallest to largest.

The larger the sketch size the more accurate the distance calculation will be but the longer it will take to calculate.

TTFN,
Tim
*/
package mash

import (
	"encoding/binary"
	"hash"
	"sort"
)

// murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench

// Mash is a collection of hashes of kmers from a given sequence.
type Mash struct {
	KmerSize   int       // The kmer size is the size of the sliding window that is used to generate the hashes.
	SketchSize int       // The sketch size is the number of hashes to store.
	Sketches   []uint32  // The sketches are the hashes of the kmers that we can compare to other sketches.
	Hash       hash.Hash // Hash is the go standard library hashing interface. Can be used to switch algorithms.
}

// New initializes a new mash sketch.
func New(kmerSize int, sketchSize int, hashInterface hash.Hash) *Mash {
	return &Mash{
		KmerSize:   kmerSize,
		SketchSize: sketchSize,
		Sketches:   make([]uint32, sketchSize),
		Hash:       hashInterface,
	}
}

// Sketch generates a mash sketch of the sequence.
func (mash *Mash) Sketch(sequence string) {
	// the sketch size is the number of hashes to store. Pre-shifted to avoid off-by-one errors.
	maxShiftedSketchSize := mash.SketchSize - 1

	// slide a window of size k along the sequence
	for kmerStart := 0; kmerStart < len(sequence)-mash.KmerSize; kmerStart++ {
		kmer := sequence[kmerStart : kmerStart+mash.KmerSize]
		// hash the kmer to a 32 bit number
		hashBytes := mash.Hash.Sum([]byte(kmer))
		hash := binary.BigEndian.Uint32(hashBytes[:4]) // need to convert []byte to uint32
		// keep the minimum hash value of all the kmers in the window up to a given sketch size
		// the sketch is a vector of the minimum hash values

		// if the sketch is not full, store the hash in the sketch
		if kmerStart < maxShiftedSketchSize {
			mash.Sketches[kmerStart] = hash
			continue
		}

		// if the sketch has just been filled add the hash to the sketch and sort the sketch
		if kmerStart == maxShiftedSketchSize {
			// sort the sketch from smallest to largest
			mash.Sketches[maxShiftedSketchSize] = hash
			sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] })
			continue
		}

		// if the sketch is full and the new hash is smaller than the largest hash in the sketch,
		// replace the largest hash with the new hash and sort the sketch if the new hash is smaller than the second largest hash in the sketch
		if kmerStart > maxShiftedSketchSize && mash.Sketches[maxShiftedSketchSize] > hash {
			mash.Sketches[maxShiftedSketchSize] = hash
			if hash < mash.Sketches[maxShiftedSketchSize-1] { // if the new hash is smaller than the second largest hash in the sketch, sort the sketch
				sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] })
			}
			continue
		}
	}
}

// Similarity returns the Jaccard similarity between two sketches (number of matching hashes / sketch size)
func (mash *Mash) Similarity(other *Mash) float64 {
	var sameHashes int
	largerSketch := mash
	smallerSketch := other

	if mash.SketchSize < other.SketchSize {
		largerSketch = other
		smallerSketch = mash
	}

	if largerSketch.Sketches[largerSketch.SketchSize-1] < smallerSketch.Sketches[0] || smallerSketch.Sketches[smallerSketch.SketchSize-1] < largerSketch.Sketches[0] {
		return 0
	}

	smallSketchIndex, largeSketchIndex := 0, 0
	for smallSketchIndex < smallerSketch.SketchSize && largeSketchIndex < largerSketch.SketchSize {
		if smallerSketch.Sketches[smallSketchIndex] == largerSketch.Sketches[largeSketchIndex] {
			sameHashes++
			smallSketchIndex++
			largeSketchIndex++
		} else if smallerSketch.Sketches[smallSketchIndex] < largerSketch.Sketches[largeSketchIndex] {
			smallSketchIndex++
		} else {
			largeSketchIndex++
		}
	}

	return float64(sameHashes) / float64(smallerSketch.SketchSize)
}

// Distance returns the Jaccard distance between two sketches (1 - similarity)
func (mash *Mash) Distance(other *Mash) float64 {
	return 1 - mash.Similarity(other)
}
